# Alerting rules for workload controllers (Deployment, DaemonSet, StatefulSet).
# The *ReplicasUnavailable rules join against extended_monitoring_*_threshold
# series, so a controller without a matching "replicas-not-ready" threshold
# series yields no result for those rules.
- name: kubernetes.extended-monitoring.application-controllers
  rules:
  # Fires when, for 5 minutes, a Deployment has more unavailable replicas than
  # the sum of its "replicas-not-ready" extended-monitoring threshold and its
  # rolling-update maxUnavailable value.
  - alert: KubernetesDeploymentReplicasUnavailable
    expr: |
      kube_deployment_status_replicas_unavailable
      > on (namespace, deployment)
      (
        max by (namespace, deployment) (extended_monitoring_deployment_threshold{threshold="replicas-not-ready"})
        + on (namespace, deployment)
        kube_deployment_spec_strategy_rollingupdate_max_unavailable
      )
    for: 5m
    labels:
      severity_level: "6"
    annotations:
      plk_protocol_version: "1"
      plk_markup_format: "markdown"
      plk_create_group_if_not_exists__controllers_malfunctioning: "KubernetesControllersMalfunctioningInNamespace,prometheus=deckhouse,namespace={{ $labels.namespace }},kubernetes=~kubernetes"
      plk_grouped_by__controllers_malfunctioning: "KubernetesControllersMalfunctioningInNamespace,prometheus=deckhouse,namespace={{ $labels.namespace }},kubernetes=~kubernetes"
      summary: |-
        Count of unavailable replicas in Deployment {{$labels.namespace}}/{{$labels.deployment}} is violating "spec.strategy.rollingUpdate.maxUnavailable".
      # The "Threshold at" query mirrors the right-hand side of the alert
      # expression (threshold + maxUnavailable) so that the reported number is
      # the one the current value was actually compared against.
      description: |-
        Count of unavailable replicas in Deployment {{$labels.namespace}}/{{$labels.deployment}} is violating "spec.strategy.rollingUpdate.maxUnavailable".

        Currently at: {{ .Value }} unavailable replica(s)
        Threshold at: {{ printf "max by (namespace, deployment) (extended_monitoring_deployment_threshold{threshold=\"replicas-not-ready\", namespace=\"%s\", deployment=\"%s\"}) + on (namespace, deployment) kube_deployment_spec_strategy_rollingupdate_max_unavailable{namespace=\"%s\", deployment=\"%s\"}" $labels.namespace $labels.deployment $labels.namespace $labels.deployment | query | first | value }} unavailable replica(s) (extended-monitoring threshold + maxUnavailable)

        List of unavailable Pod(s): {{range $index, $result := (printf "(max by (namespace, pod) (kube_pod_status_ready{namespace=\"%s\", condition!=\"true\"} == 1)) * on (namespace, pod) kube_controller_pod{namespace=\"%s\", controller_type=\"Deployment\", controller_name=\"%s\"}" $labels.namespace $labels.namespace $labels.deployment | query)}}{{if not (eq $index 0)}}, {{ end }}{{ $result.Labels.pod }}{{ end }}

  # Higher-severity variant: fires when a Deployment whose desired replica
  # count is non-zero has had zero available replicas for 5 minutes.
  - alert: KubernetesDeploymentReplicasUnavailable
    # The multiplication with the threshold series acts as a join: only
    # Deployments that have a "replicas-not-ready" threshold series remain.
    expr: |
      (
        (kube_deployment_status_replicas_available == 0) * (kube_deployment_spec_replicas != 0)
      )
      * on (namespace, deployment)
      (
        max by (namespace, deployment) (extended_monitoring_deployment_threshold{threshold="replicas-not-ready"})
      )
    for: 5m
    labels:
      severity_level: '5'
    annotations:
      plk_protocol_version: '1'
      plk_markup_format: 'markdown'
      # Both grouping annotations carry the same per-namespace group key.
      plk_create_group_if_not_exists__controllers_malfunctioning: 'KubernetesControllersMalfunctioningInNamespace,prometheus=deckhouse,namespace={{ $labels.namespace }},kubernetes=~kubernetes'
      plk_grouped_by__controllers_malfunctioning: 'KubernetesControllersMalfunctioningInNamespace,prometheus=deckhouse,namespace={{ $labels.namespace }},kubernetes=~kubernetes'
      summary: >-
        Count of available replicas in Deployment {{$labels.namespace}}/{{$labels.deployment}} is at zero.
      description: |-
        Count of available replicas in Deployment {{$labels.namespace}}/{{$labels.deployment}} is at zero.

        List of unavailable Pod(s): {{range $index, $result := (printf "(max by (namespace, pod) (kube_pod_status_ready{namespace=\"%s\", condition!=\"true\"} == 1)) * on (namespace, pod) kube_controller_pod{namespace=\"%s\", controller_type=\"Deployment\", controller_name=\"%s\"}" $labels.namespace $labels.namespace $labels.deployment | query)}}{{if not (eq $index 0)}}, {{ end }}{{ $result.Labels.pod }}{{ end }}

  # Fires when, for 5 minutes, a DaemonSet reports more unavailable pods than
  # its "replicas-not-ready" extended-monitoring threshold.
  - alert: KubernetesDaemonSetReplicasUnavailable
    expr: |
      kube_daemonset_status_number_unavailable
      > on (namespace, daemonset)
      (
        max by (namespace, daemonset) (extended_monitoring_daemonset_threshold{threshold="replicas-not-ready"})
      )
    for: 5m
    labels:
      severity_level: "6"
    annotations:
      plk_protocol_version: "1"
      plk_markup_format: "markdown"
      plk_create_group_if_not_exists__controllers_malfunctioning: "KubernetesControllersMalfunctioningInNamespace,prometheus=deckhouse,namespace={{ $labels.namespace }},kubernetes=~kubernetes"
      plk_grouped_by__controllers_malfunctioning: "KubernetesControllersMalfunctioningInNamespace,prometheus=deckhouse,namespace={{ $labels.namespace }},kubernetes=~kubernetes"
      summary: |-
        Count of unavailable replicas in DaemonSet {{$labels.namespace}}/{{$labels.daemonset}} is above threshold.
      # The blank line after the first sentence keeps it a separate markdown
      # paragraph, matching the Deployment and StatefulSet descriptions.
      description: |-
        Count of unavailable replicas in DaemonSet {{$labels.namespace}}/{{$labels.daemonset}} is above threshold.

        Currently at: {{ .Value }} unavailable replica(s)
        Threshold at: {{ printf "extended_monitoring_daemonset_threshold{threshold=\"replicas-not-ready\", namespace=\"%s\", daemonset=\"%s\"}" $labels.namespace $labels.daemonset | query | first | value }} unavailable replica(s)

        List of unavailable Pod(s): {{range $index, $result := (printf "(max by (namespace, pod) (kube_pod_status_ready{namespace=\"%s\", condition!=\"true\"} == 1)) * on (namespace, pod) kube_controller_pod{namespace=\"%s\", controller_type=\"DaemonSet\", controller_name=\"%s\"}" $labels.namespace $labels.namespace $labels.daemonset | query)}}{{if not (eq $index 0)}}, {{ end }}{{ $result.Labels.pod }}{{ end }}

        This command might help figuring out problematic nodes given you are aware where the DaemonSet should be scheduled in the first place (using label selector for pods might be of help, too):

        ```
        kubectl -n {{$labels.namespace}} get pod -ojson | jq -r '.items[] | select(.metadata.ownerReferences[] | select(.name =="{{$labels.daemonset}}")) | select(.status.phase != "Running" or ([ .status.conditions[] | select(.type == "Ready" and .status == "False") ] | length ) == 1 ) | .spec.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[].matchFields[].values[]'
        ```


  # Higher-severity variant: fires when a DaemonSet whose desired scheduled
  # count is non-zero has had zero available pods for 5 minutes.
  - alert: KubernetesDaemonSetReplicasUnavailable
    # The multiplication with the threshold series acts as a join: only
    # DaemonSets that have a "replicas-not-ready" threshold series remain.
    expr: |
      (
        (kube_daemonset_status_number_available == 0) * (kube_daemonset_status_desired_number_scheduled != 0)
      )
      * on (namespace, daemonset)
      (
        max by (namespace, daemonset) (extended_monitoring_daemonset_threshold{threshold="replicas-not-ready"})
      )
    for: 5m
    labels:
      severity_level: '5'
    annotations:
      plk_protocol_version: '1'
      plk_markup_format: 'markdown'
      # Both grouping annotations carry the same per-namespace group key.
      plk_create_group_if_not_exists__controllers_malfunctioning: 'KubernetesControllersMalfunctioningInNamespace,prometheus=deckhouse,namespace={{ $labels.namespace }},kubernetes=~kubernetes'
      plk_grouped_by__controllers_malfunctioning: 'KubernetesControllersMalfunctioningInNamespace,prometheus=deckhouse,namespace={{ $labels.namespace }},kubernetes=~kubernetes'
      summary: >-
        Count of available replicas in DaemonSet {{$labels.namespace}}/{{$labels.daemonset}} is at zero.
      description: |-
        Count of available replicas in DaemonSet {{$labels.namespace}}/{{$labels.daemonset}} is at zero.

        List of unavailable Pod(s): {{range $index, $result := (printf "(max by (namespace, pod) (kube_pod_status_ready{namespace=\"%s\", condition!=\"true\"} == 1)) * on (namespace, pod) kube_controller_pod{namespace=\"%s\", controller_type=\"DaemonSet\", controller_name=\"%s\"}" $labels.namespace $labels.namespace $labels.daemonset | query)}}{{if not (eq $index 0)}}, {{ end }}{{ $result.Labels.pod }}{{ end }}

        This command might help figuring out problematic nodes given you are aware where the DaemonSet should be scheduled in the first place (using label selector for pods might be of help, too):

        ```
        kubectl -n {{$labels.namespace}} get pod -ojson | jq -r '.items[] | select(.metadata.ownerReferences[] | select(.name =="{{$labels.daemonset}}")) | select(.status.phase != "Running" or ([ .status.conditions[] | select(.type == "Ready" and .status == "False") ] | length ) == 1 ) | .spec.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[].matchFields[].values[]'
        ```

  # Low-priority notice: fires when, per (namespace, daemonset), the desired
  # scheduled count has exceeded the updated scheduled count for 15 minutes.
  - alert: KubernetesDaemonSetNotUpToDate
    # The alert value is the difference itself, i.e. the number of pods that
    # the annotations below report as outdated.
    expr: |
      max by (namespace, daemonset) (kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_updated_number_scheduled) > 0
    for: 15m
    labels:
      severity_level: '9'
    annotations:
      plk_protocol_version: '1'
      plk_markup_format: 'markdown'
      # Both grouping annotations carry the same per-namespace group key.
      plk_create_group_if_not_exists__controllers_malfunctioning: 'KubernetesControllersMalfunctioningInNamespace,prometheus=deckhouse,namespace={{ $labels.namespace }},kubernetes=~kubernetes'
      plk_grouped_by__controllers_malfunctioning: 'KubernetesControllersMalfunctioningInNamespace,prometheus=deckhouse,namespace={{ $labels.namespace }},kubernetes=~kubernetes'
      summary: >-
        There are {{ .Value }} outdated Pods in the {{ $labels.namespace }}/{{ $labels.daemonset }} DaemonSet for the last 15 minutes.
      description: |-
        There are {{ .Value }} outdated Pods in the {{ $labels.namespace }}/{{ $labels.daemonset }} DaemonSet for the last 15 minutes.

        The recommended course of action:
        1. Check the DaemonSet's status: `kubectl -n {{ $labels.namespace }} get ds {{ $labels.daemonset }}`
        2. Analyze the DaemonSet's description: `kubectl -n {{ $labels.namespace }} describe ds {{ $labels.daemonset }}`
        3. If the `Number of Nodes Scheduled with Up-to-date Pods` parameter does not match
        `Current Number of Nodes Scheduled`, check the DaemonSet's updateStrategy:
        `kubectl -n {{ $labels.namespace }} get ds {{ $labels.daemonset }} -o json | jq '.spec.updateStrategy'`
        4. Note that if the OnDelete updateStrategy is set, the DaemonSet gets only updated when Pods are deleted.

  # Fires when, for 5 minutes, the number of not-ready replicas of a
  # StatefulSet (status replicas minus ready replicas) exceeds its
  # "replicas-not-ready" extended-monitoring threshold.
  - alert: KubernetesStatefulSetReplicasUnavailable
    expr: |
      (
        kube_statefulset_status_replicas - kube_statefulset_status_replicas_ready
      )
      > on (namespace, statefulset)
      (
        max by (namespace, statefulset) (extended_monitoring_statefulset_threshold{threshold="replicas-not-ready"})
      )
    for: 5m
    labels:
      severity_level: "6"
    annotations:
      plk_protocol_version: "1"
      plk_markup_format: "markdown"
      plk_create_group_if_not_exists__controllers_malfunctioning: "KubernetesControllersMalfunctioningInNamespace,prometheus=deckhouse,namespace={{ $labels.namespace }},kubernetes=~kubernetes"
      plk_grouped_by__controllers_malfunctioning: "KubernetesControllersMalfunctioningInNamespace,prometheus=deckhouse,namespace={{ $labels.namespace }},kubernetes=~kubernetes"
      summary: |-
        Count of unavailable replicas in StatefulSet {{$labels.namespace}}/{{$labels.statefulset}} above threshold.
      # The pod-list query must filter on $labels.statefulset: the alert's
      # series carry a "statefulset" label, not a "deployment" label.
      description: |-
        Count of unavailable replicas in StatefulSet {{$labels.namespace}}/{{$labels.statefulset}} above threshold.

        Currently at: {{ .Value }} unavailable replica(s)
        Threshold at: {{ printf "extended_monitoring_statefulset_threshold{threshold=\"replicas-not-ready\", namespace=\"%s\", statefulset=\"%s\"}" $labels.namespace $labels.statefulset | query | first | value }} unavailable replica(s)

        List of unavailable Pod(s): {{range $index, $result := (printf "(max by (namespace, pod) (kube_pod_status_ready{namespace=\"%s\", condition!=\"true\"} == 1)) * on (namespace, pod) kube_controller_pod{namespace=\"%s\", controller_type=\"StatefulSet\", controller_name=\"%s\"}" $labels.namespace $labels.namespace $labels.statefulset | query)}}{{if not (eq $index 0)}}, {{ end }}{{ $result.Labels.pod }}{{ end }}

  # Higher-severity variant: fires when a StatefulSet whose desired replica
  # count is non-zero has had zero ready replicas for 5 minutes.
  - alert: KubernetesStatefulSetReplicasUnavailable
    # The "kube_statefulset_replicas != 0" factor excludes StatefulSets that
    # are intentionally scaled to zero, mirroring the desired-count guards in
    # the Deployment and DaemonSet zero-availability rules.
    expr: |
      (
        (kube_statefulset_status_replicas_ready == 0) * (kube_statefulset_replicas != 0)
      )
      * on (namespace, statefulset)
      (
        max by (namespace, statefulset) (extended_monitoring_statefulset_threshold{threshold="replicas-not-ready"})
      )
    for: 5m
    labels:
      severity_level: "5"
    annotations:
      plk_protocol_version: "1"
      plk_markup_format: "markdown"
      plk_create_group_if_not_exists__controllers_malfunctioning: "KubernetesControllersMalfunctioningInNamespace,prometheus=deckhouse,namespace={{ $labels.namespace }},kubernetes=~kubernetes"
      plk_grouped_by__controllers_malfunctioning: "KubernetesControllersMalfunctioningInNamespace,prometheus=deckhouse,namespace={{ $labels.namespace }},kubernetes=~kubernetes"
      summary: |-
        Count of ready replicas in StatefulSet {{$labels.namespace}}/{{$labels.statefulset}} at zero.
      # The pod-list query must filter on $labels.statefulset: the alert's
      # series carry a "statefulset" label, not a "deployment" label.
      description: |-
        Count of ready replicas in StatefulSet {{$labels.namespace}}/{{$labels.statefulset}} at zero.

        List of unavailable Pod(s): {{range $index, $result := (printf "(max by (namespace, pod) (kube_pod_status_ready{namespace=\"%s\", condition!=\"true\"} == 1)) * on (namespace, pod) kube_controller_pod{namespace=\"%s\", controller_type=\"StatefulSet\", controller_name=\"%s\"}" $labels.namespace $labels.namespace $labels.statefulset | query)}}{{if not (eq $index 0)}}, {{ end }}{{ $result.Labels.pod }}{{ end }}
